# Set working directory to main replication folder
setwd("C:/Users/Eric/Dropbox/Book/analysis/replication/")

# Load packages
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyverse)
library(quanteda)
library(stargazer)
library(lmtest)
library(sandwich)

# Load functions
source("./function_code/korea_coef_plot.R")
source("./function_code/korea_heat_map.R")

# Load data
dd = read.csv("./data/korea_report_data.csv", stringsAsFactors = F)
dd$date = as.Date(dd$date)

sp = read.csv("./data/korea_speech_data.csv", stringsAsFactors = F)
sp$date = as.Date(sp$date)
sp$meetTypeMain = factor(sp$meetTypeMain, levels=c("Plenary", "Sub-Delegation", "Staff", "Other"))
sp$speaker = factor(sp$speaker, levels=c("UNC", "Communists"))


## Check predictions for statements in the chapter
sp |> filter(grepl("I would like to take up the arguments in regard to Kaesong", speech)) |> dplyr::select(meanClass)#, meanProb)
sp |> filter(grepl("common criminals or persons who through ignorance or stupidity", speech)) |> dplyr::select(meanClass)#, meanProb)
sp |> filter(grepl("insert this phrase in the sentence", speech)) |> dplyr::select(meanClass)#, meanProb)
sp |> filter(grepl("resorted on the other hand to fabrication and calumny", speech)) |> dplyr::select(meanClass)#, meanProb)

# Distribution of the sincerity measure for speech acts
table(sp$meanClass)
round(prop.table(table(sp$meanClass)), 2)


#### TABLE 6.3: Distribution of sincere negotiation behavior by delegation and meeting level ####
behlev = sp |> dplyr::group_by(phase, meetTypeMain, speaker, date) |> 
  dplyr::summarize(propsin=mean(meanClass=="Sincere"))
behlev |> group_by(speaker, meetTypeMain) |> summarize(mean(propsin))


## Compare meeting types in binary fashion
mtdata = sp |> dplyr::group_by(meetTypeMain, speaker, date) |> dplyr::summarize(meanClass = mean(meanClass=="Sincere"))

t.test(mtdata$meanClass[which(mtdata$meetTypeMain=="Plenary" & mtdata$speaker=="UNC")],
       mtdata$meanClass[which(mtdata$meetTypeMain=="Staff" & mtdata$speaker=="UNC")])
t.test(mtdata$meanClass[which(mtdata$meetTypeMain=="Sub-Delegation" & mtdata$speaker=="UNC")],
       mtdata$meanClass[which(mtdata$meetTypeMain=="Staff" & mtdata$speaker=="UNC")])

t.test(mtdata$meanClass[which(mtdata$meetTypeMain=="Plenary" & mtdata$speaker=="Communists")],
       mtdata$meanClass[which(mtdata$meetTypeMain=="Staff" & mtdata$speaker=="Communists")])
t.test(mtdata$meanClass[which(mtdata$meetTypeMain=="Sub-Delegation" & mtdata$speaker=="Communists")],
       mtdata$meanClass[which(mtdata$meetTypeMain=="Staff" & mtdata$speaker=="Communists")])

meetsincere = sp |> dplyr::group_by(meetID) |> dplyr::summarize(meetsincere=mean(meanClass=="Sincere")) # Daily level of sincerity
lowSpeech = quantile(meetsincere$meetsincere, 0.05, na.rm=T)
lowMeets = meetsincere$meetID[which(meetsincere$meetsincere <= lowSpeech)]

bads = sp[which(sp$meetID %in% lowMeets),]
bads$meetID


#### What are the lowest moments in the talks? ####
daysincere = sp |> dplyr::group_by(date) |> dplyr::summarize(daySincere=mean(meanClass=="Sincere")) # Daily level of sincerity
lowSpeech = quantile(daysincere$daySincere, 0.05, na.rm=T)
lowDays = daysincere$date[which(daysincere$daySincere <= lowSpeech)]

bads = sp[which(sp$date %in% lowDays),]
bads$meetIDitem = paste(bads$meetID, bads$item, sep="-")

# What are the specific days with most insincerity?
unique(bads$date) # 24 days

# Which meetings are involved in these days?
unique(bads$meetIDitem)  # Any day without an item is also about Item 4 (just not in a subdelegation or staff meeting)


#### How are speech acts relating to the proposal of the USSR as a neutral nation? ####
sp |> filter(grepl("namely, the Soviet Union, Poland, and Czechoslovakia", speech)) |> dplyr::select(meanClass)#, meanProb) # Identify the speech act with the proposal

# Identify speech acts before and after this proposal in Item 3 meetings
before_ussr = sp |> filter(date<"1952-02-16" & item=="Item 3") |> filter(grepl("neutral", speech, ignore.case = T))
before_ussr$time = "before"
after_ussr = sp |> filter(date>="1952-02-16" & item=="Item 3") |> filter(grepl("neutral", speech, ignore.case = T))
after_ussr$time = "after"
ussr_test = rbind(before_ussr, after_ussr)

# What are rates of sincerity?
round(prop.table(table(ussr_test$time, ussr_test$meanClass), 1), 3)

# Are these differences statistically significant?
chisq.test(table(ussr_test$time, ussr_test$meanClass))  # p << 0.01


#### FIGURE 6.2: Gains and losses for the UNC over the course of the Korean War ####
pn = data.frame(date=rep(dd$date, 2),
                measure=c(dd$dayPos60, dd$dayNeg60),
                move=c(rep("Gain", nrow(dd)), rep("Loss", nrow(dd))))

pnPlot = ggplot(pn, aes(x=date, y=measure)) + geom_line(aes(color=move), lineend="round", linewidth=0.7) + xlab("Year") +
  ylab("Recent movement\n(proportion of terms)") + theme_bw() +
  ggplot2::annotate("rect", xmin=as.Date("1951-07-09"), xmax=as.Date("1951-08-23"), 
                    ymin=min(pn$measure, na.rm=T)-0.1, ymax=max(pn$measure, na.rm=T)+0.1,
                    alpha=0.25, fill="gray40") +
  ggplot2::annotate("rect", xmin=as.Date("1951-10-25"), xmax=as.Date("1952-10-08"), 
                    ymin=min(pn$measure, na.rm=T)-0.1, ymax=max(pn$measure, na.rm=T)+0.1,
                    alpha=0.25, fill="gray40") +
  ggplot2::annotate("rect", xmin=as.Date("1953-04-08"), xmax=as.Date("1953-06-30"), 
                    ymin=min(pn$measure, na.rm=T)-0.1, ymax=max(pn$measure, na.rm=T)+0.1,
                    alpha=0.25, fill="gray40") +
  scale_color_manual("", values=c("gray10", "gray40")) +
  theme(legend.position="bottom") +
  coord_cartesian(xlim=c(as.Date("1950-08-01"), as.Date("1953-07-01")),
                  ylim=c(min(pn$measure, na.rm=T), max(pn$measure, na.rm=T)))
pnPlot

ggsave("./figures/figure_6.2.pdf", width=4.5, height=2.5)


#### FIGURE 6.3: UNC casualties and Communist POWs over the course of the Korean War ####
dd |> pivot_longer(cols=c("rollUNCLoss60l", "rollPOW60l"), names_to="cas_type", values_to="cas_count") |> 
  mutate(cas_type=factor(cas_type, levels=c("rollPOW60l", "rollUNCLoss60l"))) |>
  ggplot(aes(date, cas_count)) + xlab("Year") + ylab("Recent costs\n(troops, logged values)") + 
  ggplot2::annotate("rect", xmin=as.Date("1951-07-09"), xmax=as.Date("1951-08-23"), 
                    ymin=min(dd$cas_count, na.rm=T), ymax=max(dd$cas_count, na.rm=T),
                    alpha=0.25, fill="gray40") +
  ggplot2::annotate("rect", xmin=as.Date("1951-10-25"), xmax=as.Date("1952-10-08"), 
                    ymin=min(dd$cas_count, na.rm=T), ymax=max(dd$cas_count, na.rm=T),
                    alpha=0.25, fill="gray40") +
  ggplot2::annotate("rect", xmin=as.Date("1953-04-08"), xmax=as.Date("1953-06-30"), 
                    ymin=min(dd$cas_count, na.rm=T), ymax=max(dd$cas_count, na.rm=T),
                    alpha=0.25, fill="gray40") +
  geom_line(aes(color=fct_rev(cas_type)), linewidth=0.7, lineend="round") +
  theme_bw() +
  theme(legend.position = "bottom") +
  scale_color_manual("", values=c("gray10", "gray40"), labels=c("UNC\ncasualties", "Communist\nPOWs")) +
  coord_cartesian(xlim=c(as.Date("1950-08-01"), as.Date("1953-07-01")))

ggsave("./figures/figure_6.3.pdf", width=4.5, height=2.5)


#### FIGURE 6.4: Smoothed negotiation behavior (excluding control, liaison, and investigatory meetings) ####
behlev2 = behlev |> filter(meetTypeMain!="Other", phase!="None")
behlev2$group = paste(behlev2$speaker, behlev2$phase, sep=" ")

ggplot(behlev2, aes(date, propsin)) + 
  geom_smooth(aes(color=speaker, fill=group), method="loess", span=0.5, se=F, linewidth=0.7, lineend="round") + 
  xlab("Year") + ylab("Sincere behavior\n(proportion of statements)") + 
  scale_fill_manual("", labels=c("Communists", "Communists", "Communists", "UNC", "UNC", "UNC"),
                    values=c("gray10", "gray10", "gray10", "gray50", "gray50", "gray50"), guide="none") +
  scale_color_manual("", breaks=c("Communists", "UNC"), labels=c("Communists", "UNC"), values=c("gray10", "gray50")) +
  scale_x_date(date_breaks="1 year", date_labels = "%Y") +
  theme_bw() + 
  theme(legend.position="bottom", 
        plot.title=element_text(size=10),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9))

ggsave("./figures/figure_6.4.pdf", width=4.5, height=2.5)


#### REGRESSIONS ####

# Coefficient plots become easier to read if variables are scaled
spscale = sp |> dplyr::select(meanClass, 
                              ri30, dayPos30, dayNeg30,
                              ri60, dayPos60, dayNeg60,
                              ri90, dayPos90, dayNeg90,
                              rollBothLoss30l, rollUNCLoss30l, rollPOW30l,
                              rollBothLoss60l, rollUNCLoss60l, rollPOW60l,
                              rollBothLoss90l, rollUNCLoss90l, rollPOW90l,
                              winter,
                              logRepLength, eisenhower, stalin, election,
                              logCombatSorties,
                              date, speaker, meetTypeMain, speakMeetID) |>
  mutate(ri30=scale(ri30)[,1], dayPos30=scale(dayPos30)[,1], dayNeg30=scale(dayNeg30)[,1],
         ri60=scale(ri60)[,1], dayPos60=scale(dayPos60)[,1], dayNeg60=scale(dayNeg60)[,1],
         ri90=scale(ri90)[,1], dayPos90=scale(dayPos90)[,1], dayNeg90=scale(dayNeg90)[,1],
         rollBothLoss30l=scale(rollBothLoss30l)[,1], rollUNCLoss30l=scale(rollUNCLoss30l)[,1], rollPOW30l=scale(rollPOW30l)[,1],
         rollBothLoss60l=scale(rollBothLoss60l)[,1], rollUNCLoss60l=scale(rollUNCLoss60l)[,1], rollPOW60l=scale(rollPOW60l)[,1],
         rollBothLoss90l=scale(rollBothLoss90l)[,1], rollUNCLoss90l=scale(rollUNCLoss90l)[,1], rollPOW90l=scale(rollPOW90l)[,1],
         winter=scale(winter)[,1],
         logRepLength=scale(logRepLength)[,1], 
         eisenhower=scale(eisenhower)[,1], 
         stalin=scale(stalin)[,1], 
         election=scale(election)[,1],
         logCombatSorties=scale(logCombatSorties)[,1]) 

## All speech acts
speechFit60a = glm(I(meanClass=="Sincere") ~
                    ri60 + rollBothLoss60l + 
                    winter + logRepLength + 
                    eisenhower + stalin +
                    election + I(speaker=="Communists") + 
                    factor(meetTypeMain),
                  family="binomial",
                  data=spscale)
speechFit60ar = coeftest(speechFit60a, vcov = vcovCL, cluster = ~ speakMeetID)

# Using only up to the end of 1952... get rid of the leader change variables
# and include the combat sorties variable (which disappears in 1953)
speechFit60b = glm(I(meanClass=="Sincere") ~ 
                    ri60 + rollBothLoss60l + 
                    winter + logRepLength + 
                    logCombatSorties + 
                    election + I(speaker=="Communists") + 
                    factor(meetTypeMain),
                  family="binomial",
                  data=spscale[which(spscale$date <= "1952-12-31"),])
speechFit60br = coeftest(speechFit60b, vcov = vcovCL, cluster = ~ speakMeetID)


#### FIGURE 6.5: Coefficient plots of logistic regressions concerning Korean War negotiation sincerity ####
koreaCoefPlot(speechFit60ar, speechFit60br, "Main")
file.rename("./figures/coefPlotKoreaMain.pdf",
            "./figures/figure_6.5.pdf")


#### TABLE 6.4: Predicted sincerity of negotiation behavior across minimal and maximal levels of movement and costs ####
# Note that this code also produces a heat map (Figure A15)
koreaHeatMap("ri60", "rollBothLoss60l", speechFit60a, fileID="Main")


# Speech acts were made when Eisenhower was president and Stalin was alive
sum(sp$eisenhower==1 & sp$stalin==0) # 143


#### Splitting by delegation ####

## Use only the UNC
speechFit60c = glm(I(meanClass=="Sincere") ~
                    dayNeg60 + rollUNCLoss60l + 
                    winter + logRepLength + eisenhower + stalin + election +
                    factor(meetTypeMain),
                  family="binomial",
                  data=spscale[which(spscale$speaker=="UNC"),])
speechFit60cr = coeftest(speechFit60c, vcov = vcovCL, cluster = ~ speakMeetID)


## Use only the Communists
speechFit60d = glm(I(meanClass=="Sincere") ~ 
                    dayPos60 + rollPOW60l + 
                    winter + logRepLength + eisenhower + stalin + election +
                    factor(meetTypeMain),
                  family="binomial",
                  data=spscale[which(spscale$speaker=="Communists"),])
speechFit60dr = coeftest(speechFit60d, vcov = vcovCL, cluster = ~ speakMeetID)

#### FIGURE 6.6: Coefficient plots of logistic regressions concerning Korean War negotiation behavior, disaggregated by delegation ####
koreaCoefPlot2(speechFit60cr, speechFit60dr, "Split")
file.rename("./figures/coefPlotKoreaSplit.pdf",
            "./figures/figure_6.6.pdf")


######################################################################################################


#### APPENDIX MATERIALS FOR CHAPTER 6 ####

##### APPENDIX 5.1: MODEL METRICS #####

load("./data/korea_confusion_data.RData")

###### TABLE A31: Performance metrics for candidate supervised learning models ######
round(c(confRF$overall[c("Accuracy", "Kappa")], confRF[[4]][c("Sensitivity", "Specificity")]), 3)
round(c(confGB$overall[c("Accuracy", "Kappa")], confGB[[4]][c("Sensitivity", "Specificity")]), 3)
round(c(confNN$overall[c("Accuracy", "Kappa")], confNN[[4]][c("Sensitivity", "Specificity")]), 3)
round(c(confNB$overall[c("Accuracy", "Kappa")], confNB[[4]][c("Sensitivity", "Specificity")]), 3)
round(c(confSVM$overall[c("Accuracy", "Kappa")], confSVM[[4]][c("Sensitivity", "Specificity")]), 3)


##### APPENDIX 5.2: DESCRIPTIVE STATISTICS #####

###### TABLE A32: Summary statistics for continuous variables ######
sp |> dplyr::select(ri60, rollBothLoss60l, dayPos60, dayNeg60, 
                    rollUNCLoss60l, rollPOW60l, 
                    logRepLength) |> 
  dplyr::reframe(across(everything(), function(x) {round(summary(x), 3)})) |> t()
summary(sp$logCombatSorties)

###### TABLE A33: Summary statistics for discrete variables (counts) ######
sp |> dplyr::select(meanClass, winter, eisenhower, stalin, speaker) |>
  dplyr::reframe(across(everything(), table)) |> t()
table(sp$meetTypeMain)

###### TABLE A34: Summary statistics for discrete variables (proportions) ######
sp |> dplyr::select(meanClass, winter, eisenhower, stalin, speaker) |>
  dplyr::reframe(across(everything(), function(x) {round(prop.table(table(x)), 3)})) |> t()
round(prop.table(table(sp$meetTypeMain)), 3)


##### APPENDIX 5.3: REGRESSION RESULTS ######

###### APPENDIX 5.3.1: Full 60-Day Results ######

####### TABLE A34: Full statistical results for models presented in Chapter 6 #######

# Initial results before clustered standard errors
# Note that the number of observations per model only shows up here
stargazer(speechFit60a, speechFit60b, speechFit60c, speechFit60d,   
          column.sep.width = "0pt", no.space=T, align=T, df=F,
          covariate.labels = c("Recent imbalance", "Recent casualties",
                               "Recent UNC ground losses",
                               "Recent UNC casualties",
                               "Recent Communist ground losses",
                               "Recent Communist POWs",
                               "Winter", "Report length", 
                               "Eisenhower", "Post-Stalin", "Combat sorties",  "1952 election", 
                               "Communist delegation",
                               "Sub-delegation level", "Staff level", "Other levels"))

# Results after clustered standard errors
# Note that the number of observations per model does not show up here
stargazer(speechFit60ar, speechFit60br, speechFit60cr, speechFit60dr,   
          column.sep.width = "0pt", no.space=T, align=T, df=F,
          covariate.labels = c("Recent imbalance", "Recent casualties",
                               "Recent UNC ground losses",
                               "Recent UNC casualties",
                               "Recent Communist ground losses",
                               "Recent Communist POWs",
                               "Winter", "Report length", 
                               "Eisenhower", "Post-Stalin", "Combat sorties",  "1952 election", 
                               "Communist delegation",
                               "Sub-delegation level", "Staff level", "Other levels"))

####### FIGURE A15: Heat map of predicted probabilities using Model 1 in Table A34 #######
koreaHeatMap("ri60", "rollBothLoss60l", speechFit60a, fileID="Main")
file.rename("./figures/heatMapKoreaMain.pdf",
            "./figures/figure_a15.pdf")


###### APPENDIX 5.3.2: Full 30-Day and 90-Day Results ######

## Redo analysis with 30-day measures ##
# All speech acts
speechFit30a = glm(I(meanClass=="Sincere") ~
                     ri30 + rollBothLoss30l + 
                     winter + logRepLength + 
                     eisenhower + stalin +
                     election + I(speaker=="Communists") + 
                     factor(meetTypeMain),
                  family="binomial",
                  data=spscale)
speechFit30ar = coeftest(speechFit30a, vcov = vcovCL, cluster = ~ speakMeetID)

# Using only up to the end of 1952
speechFit30b = glm(I(meanClass=="Sincere") ~ 
                    ri30 + rollBothLoss30l + 
                     winter + logRepLength + 
                     logCombatSorties + 
                     election + I(speaker=="Communists") + 
                     factor(meetTypeMain),
                  family="binomial",
                  data=spscale[which(spscale$date <= "1952-12-31"),])
speechFit30br = coeftest(speechFit30b, vcov = vcovCL, cluster = ~ speakMeetID)


## Redo analysis with 90-day measures ##

# All speech acts
speechFit90a = glm(I(meanClass=="Sincere") ~
                     ri90 + rollBothLoss90l + 
                     winter + logRepLength + 
                     eisenhower + stalin +
                     election + I(speaker=="Communists") + 
                     factor(meetTypeMain),
                   family="binomial",
                   data=spscale)
speechFit90ar = coeftest(speechFit90a, vcov = vcovCL, cluster = ~ speakMeetID)

# Using only up to the end of 1952
speechFit90b = glm(I(meanClass=="Sincere") ~ 
                     ri90 + rollBothLoss90l + 
                     winter + logRepLength + 
                     logCombatSorties + 
                     election + I(speaker=="Communists") + 
                     factor(meetTypeMain),
                   family="binomial",
                   data=spscale[which(spscale$date <= "1952-12-31"),])
speechFit90br = coeftest(speechFit90b, vcov = vcovCL, cluster = ~ speakMeetID)


####### TABLE A35: Full statistical results for models using 30-day and 90-day measures of battlefield activity #######

# Initial results before clustered standard errors
# Note that the number of observations per model only shows up here
stargazer(speechFit30a, speechFit30b, speechFit90a, speechFit90b, 
          column.sep.width = "0pt", no.space=T, align=T, df=F,
          covariate.labels = c("Recent imbalance (30-day)", "Recent casualties (30-day)",
                               "Recent imbalance (90-day)", "Recent casualties (90-day)",
                               "Winter", "Report length", 
                               "Eisenhower", "Post-Stalin", "Combat sorties",  "1952 election", 
                               "Communist delegation",
                               "Sub-delegation level", "Staff level", "Other levels"))

# Results after clustered standard errors
# Note that the number of observations per model does not show up here
stargazer(speechFit30ar, speechFit30br, speechFit90ar, speechFit90br,  
          column.sep.width = "0pt", no.space=T, align=T, df=F,
          covariate.labels = c("Recent imbalance (30-day)", "Recent casualties (30-day)",
                               "Recent imbalance (90-day)", "Recent casualties (90-day)",
                               "Winter", "Report length", 
                               "Eisenhower", "Post-Stalin", "Combat sorties",  "1952 election", 
                               "Communist delegation",
                               "Sub-delegation level", "Staff level", "Other levels"))

####### FIGURE A16: Coefficient plots of logistic regressions concerning Korean War negotiation sincerity, using a 30-day temporal window #######
koreaCoefPlot(speechFit30ar, speechFit30br, "Main30")
file.rename("./figures/coefPlotKoreaMain30.pdf",
            "./figures/figure_a16.pdf")

####### FIGURE A17: Heat map of predicted probabilities using Model 1 in Table A35 (30-day window) #######
koreaHeatMap("ri30", "rollBothLoss30l", speechFit30a, data=spscale, fileID="Main30")
file.rename("./figures/heatMapKoreaMain30.pdf",
            "./figures/figure_a17.pdf")

####### FIGURE A18: Coefficient plots of logistic regressions concerning Korean War negotiation sincerity, using a 90-day temporal window #######
koreaCoefPlot(speechFit90ar, speechFit90br, "Main90")
file.rename("./figures/coefPlotKoreaMain90.pdf",
            "./figures/figure_a18.pdf")

####### FIGURE A19: Heat map of predicted probabilities using Model 3 in Table A35 (90-day window) #######
koreaHeatMap("ri90", "rollBothLoss90l", speechFit30a, data=spscale, fileID="Main90")
file.rename("./figures/heatMapKoreaMain90.pdf",
            "./figures/figure_a19.pdf")